In this notebook we will work through some of the initial examples in the NLTK book by Steven Bird, Ewan Klein, and Edward Loper. You can follow along with the first chapter of the book at https://www.nltk.org/book/ch01.html.
In [ ]:
# First we import the NLTK library and download the data
# used in the examples in the book. The data is stored in
# a directory on the virtual machine but is accessible
# from your notebooks.
import nltk
nltk.download('book')  # fetch the "book" collection used by nltk.book
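If you are curious where that data lives, NLTK keeps a list of the directories it searches; printing it is an easy way to confirm the download location (an extra cell, not part of the book's sequence):
In [ ]:
# directories NLTK searches for corpora and other downloaded data
nltk.data.path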
In [ ]:
from nltk.book import *
In [ ]:
text5.concordance("lol")  # every occurrence of "lol" in the chat corpus, with surrounding context
In [ ]:
text1.similar("monstrous")  # words that appear in contexts similar to "monstrous"
In [ ]:
text2.common_contexts(["monstrous", "very"])  # contexts shared by both words
In [ ]:
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])  # where each word appears across the inaugural corpus
In [ ]:
print('number of tokens:', len(text3))  # counts words and punctuation symbols
In [ ]:
sorted(set(text3))[:20]  # first 20 items of the sorted vocabulary
In [ ]:
len(set(text3))  # vocabulary size: number of distinct tokens
In [ ]:
len(set(text3)) / len(text3)  # lexical diversity: distinct tokens / total tokens
In [ ]:
text3.count("smote")
In [ ]:
100 * text4.count('a') / len(text4)  # percentage of the text taken up by 'a'
In [ ]:
def lexical_diversity(text):
    return len(set(text)) / len(text)

def percentage(count, total):
    return 100 * count / total
In [ ]:
print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))
In [ ]:
fdist1 = FreqDist(text1)  # frequency distribution over the tokens of Moby Dick
print(fdist1)
fdist1.most_common(50)  # the 50 most frequent tokens
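A FreqDist also behaves like a dictionary keyed by token, so we can look up the count of a single word directly (an extra cell; the book performs the same lookup for "whale"):
In [ ]:
print(fdist1['whale'])  # raw count of "whale" in the text
print(fdist1.N())       # total number of tokens counted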
In [ ]:
fdist1.plot(50, cumulative=True)  # cumulative frequency of the 50 most common tokens
In [ ]:
fdist1.hapaxes()  # hapaxes: words that occur only once in the text
In [ ]:
V = set(text1)
# words longer than 12 characters that occur more than 7 times
long_words = [w for w in V if len(w) > 12 and fdist1[w] > 7]
sorted(long_words)
In [ ]:
list(nltk.bigrams(['more', 'is', 'said', 'than', 'done']))  # consecutive word pairs
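NLTK provides trigrams as well; a quick illustration with the same sentence (an extra cell, not in the book's sequence):
In [ ]:
list(nltk.trigrams(['more', 'is', 'said', 'than', 'done']))  # consecutive word triples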
In [ ]:
text4.collocations()  # pairs of words that occur together unusually often
In [ ]:
text8.collocations()
In [ ]:
fdist = FreqDist(len(w) for w in text1)  # distribution of word lengths
fdist
In [ ]:
print(fdist.most_common())  # (word length, count) pairs
print(fdist.max())  # the most frequent word length
print(fdist[3])  # number of three-letter tokens
print(fdist.freq(3))  # proportion of tokens that are three letters long
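As a quick check (an extra cell), freq() is just the raw count divided by the total number of samples, which FreqDist exposes as N():
In [ ]:
print(fdist.N())  # total number of samples (tokens) counted
print(fdist[3] / fdist.N())  # same value as fdist.freq(3)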
In [ ]:
# words containing 'cie' or 'cei'
tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)
for word in tricky:
    print(word, end=' ')
Keep following along with the rest of the examples in the NLTK book to get familiar with a variety of Natural Language Processing (NLP) techniques using Python and the NLTK library.